/*
This file should be run independently and before Main.do if using data for China. No need to run if using data for Mexico.

The code imports each csv file in the directory as a .dta file and renames the relevant variables, combines files together into a single dataset for year 2003 for exporters and manufacturing firms. It merges the two files and creates a China.dta file that is supplied to Main.do

Place the data in csv format into two subdirectories of a designated raw-data folder: `raw_data'\2003firm\csv_files and `raw_data'\2003customsExport\csv_files

Set up a final data directory as the "\Replication Package\Data-Stata" which is the directory from which Main.do reads .dta files
*/

* Raw-data folder: change this path to wherever the raw csv data is kept.
* It must contain the 2003customsExport and 2003firm subfolders, each of
* which holds a csv_files folder with the corresponding csv files.
local raw_data "\Data\2003original"

* Final-data folder: change this path to wherever the final dataset that is
* used in Main.do should be written.
local final_data "\Replication Package\Data-Stata"


* Customs data pre-processing
/*
Import every customs csv file, rename the Chinese column headers to short
names, and save each file as a .dta containing:
    y  - statistics period
    f  - customs firm id (string)
    hs - 6-digit HS heading (first six characters of the 8-digit code)
    d  - destination country code (string)
    v  - value in US dollars
The per-file .dta files are combined into a single dataset further below.
*/

local customs_csv = "`raw_data'\2003customsExport\csv_files" //contains all the csv files with customs records, change the path accordingly 

local customs_dta "`raw_data'\2003customsExport\dta_files" // store dta files here, change the path accordingly

* capture: do not stop if the folder already exists
capture mkdir "`customs_dta'"

* List all CSV files in the directory
local csv_files: dir "`customs_csv'" files "*.csv"

* Loop through each CSV file
foreach file of local csv_files {
    import delimited "`customs_csv'\\`file'", clear

    rename 统计时间 y

    rename 海关企业标识码 f
    tostring f, replace

    rename 商品编码 hs8
    * When the product code is imported as a number its leading zeros are
    * lost (HS chapters 01-09), which would shift the 6-digit prefix taken
    * below. Pad back to 8 digits; this is a no-op if hs8 is already a string.
    tostring hs8, replace format(%08.0f)
    gen hs = substr(hs8, 1, 6)

    rename 国别代码 d
    tostring d, replace

    rename 金额美元 v

    keep y f hs d v

    * Save the modified data in the new directory as a .dta file.
    * Strip the trailing 4-character ".csv" extension rather than cutting at
    * the first ".", so file names that contain extra dots keep distinct stems.
    local dta_name = substr("`file'", 1, strlen("`file'") - 4)
    save "`customs_dta'\\`dta_name'.dta", replace
}

// Stack every per-file customs .dta into one dataset
* Every .dta file currently present in the customs dta folder is included
local customs_parts: dir "`customs_dta'" files "*.dta"

* Flag that is true only until the first file has been loaded
local is_first 1

foreach part of local customs_parts {
    if `is_first' {
        * The first file becomes the base dataset in memory
        use "`customs_dta'\\`part'", clear
        display "Opened the first file: `part'"
        local is_first 0
    }
    else {
        * Every later file is appended to the data in memory
        append using "`customs_dta'\\`part'"
    }
}

* One row per firm / period / destination / HS heading, with summed value
collapse (sum) v, by(f y d hs)

save "`raw_data'\\customs_2003.dta", replace

/*
Firm data pre-processing
Import every firm-level csv file, rename the two identifier columns
    f_is - industrial survey firm id
    f    - customs firm id
and save each file as a .dta. The per-file .dta files are combined into
firms_2003.dta further below.
*/

local firms_csv = "`raw_data'\2003firm\csv_files" //contains all the csv files with firm level data together with the corresponding customs_id 

local firms_dta "`raw_data'\2003firm/dta_files" // store dta files with firm information here, change the path accordingly
* capture: do not stop if the folder already exists
capture mkdir "`firms_dta'"

* List all CSV files in the directory
local csv_files: dir "`firms_csv'" files "*.csv"

* Loop through each CSV file
foreach file of local csv_files {
    * Import the CSV file
    display "`firms_csv'\\`file'"
    import delimited "`firms_csv'\\`file'", clear

    rename 工业企业标识码 f_is
    rename 海关企业标识码 f
    * The customs data stores f as a string; convert here as well so that
    * the later merge on f does not fail on a string/numeric key mismatch
    * and so that all firm files append with a consistent type.
    * This is a no-op if f was already imported as a string.
    tostring f, replace

    * Save the modified data in the new directory as a .dta file.
    * Strip the trailing 4-character ".csv" extension rather than cutting at
    * the first ".", so file names that contain extra dots keep distinct stems.
    local dta_name = substr("`file'", 1, strlen("`file'") - 4)
    save "`firms_dta'\\`dta_name'.dta", replace
}


// Combine all the firm dta files into 1
* Every .dta file currently present in the firm dta folder is included
local all_files: dir "`firms_dta'" files "*.dta"

* Initialize a counter to identify the first file
local counter = 1

* Loop through the files
foreach file of local all_files {
    if `counter' == 1 {
        * The first file becomes the base dataset in memory
        use "`firms_dta'\\`file'", clear
        display "Opened the first file: `file'"
        local counter = `counter' + 1
    }
    else {
        append using "`firms_dta'\\`file'"
    }
}

// Keep one row per customs firm id.
* The merge further below is m:1 on f, so f alone must uniquely identify rows
* in this file. De-duplicating on (f, f_is) would leave duplicates of f
* whenever one customs id is linked to several survey ids, and the merge
* would then abort. Rows are ordered by f_is within f so that the row kept
* is the one with the lowest f_is.
bysort f (f_is): keep if _n == 1
save "`raw_data'\\firms_2003.dta", replace

// Restrict the customs records to exporters that also appear in the firm data
use "`raw_data'\\customs_2003.dta", clear
* keep(match) retains matched observations only; nogenerate skips _merge
merge m:1 f using "`raw_data'\\firms_2003.dta", keep(match) nogenerate
* The survey id is not carried into the final dataset
drop f_is
save "`final_data'\\China.dta", replace



